Researcher: Kirell Benzi, Benjamin Ricaud, Pierre Vandergheynst
From the paper: Principal Patterns on Graphs
The dataset is available from SNAP, the Stanford Network Analysis Project.
In [15]:
import os
import time
import numpy as np
import scipy as sp
import pandas as pd
import networkx as nx
import datetime as dt
import itertools
import operator
import community
import matplotlib as mpl
import matplotlib.pyplot as plt
from collections import Counter
import IPython.utils.path
DATA_DIR = os.path.join(IPython.utils.path.get_home_dir(), 'data/higgs/')
print 'Data directory:', DATA_DIR
dataset_name = 'higgs'
# Install this from https://github.com/kikohs/sptgraph
import sptgraph
%load_ext autoreload
%autoreload 2
# Customize plot colors for dark backgrounds
%matplotlib inline
mpl.rcParams['axes.edgecolor'] = 'grey'
mpl.rcParams['grid.color'] = '#66CCCC'
mpl.rcParams['text.color'] = '#0EBFE9'
mpl.rcParams['xtick.color'] = '#66CCCC'
mpl.rcParams['ytick.color'] = '#66CCCC'
mpl.rcParams['axes.labelcolor'] = '#0EBFE9'
In [ ]:
from bokeh.charts import Bar, show
from bokeh.plotting import output_notebook
output_notebook()
In [ ]:
def plot_spatial_spread_graph(comp):
    nodes = sorted(comp.nodes(data=True), key=lambda x: x[1]['layer'])
    group_hist = []
    layers = []
    for layer, n in itertools.groupby(nodes, key=lambda x: x[1]['layer']):
        elems = list(n)
        # Label with the timestamp instead of the layer index
        # (bug in BokehJS: labels cannot contain colons)
        t = elems[0][1]['timestamp'].strftime('%d-%m-%Y %H\'%M\'%S')
        layers.append(t)
        group_hist.append(len(elems))
    bar = Bar(np.array(group_hist), layers,
              title="Spatial spread of component " + str(comp.name), stacked=False)
    return bar
In [3]:
def round_df(df, freq, unit):
    mul = 1
    if unit == 'min':
        mul = 60
        unit = 'Min'
    elif unit == 'hour':
        mul = 60 * 60
        unit = 'H'
    elif unit == 'day':
        mul = 60 * 60 * 24
        unit = 'D'
    else:
        mul = 1
        unit = 'S'
    # Bucket size in nanoseconds (pandas timestamps have ns resolution)
    ns_min = freq * mul * 1000000000
    # Floor each timestamp to the start of its bucket
    idx = pd.DatetimeIndex(((df.index.astype(np.int64) // ns_min) * ns_min))
    idx.name = df.index.name
    df.index = idx
    return df, unit
def create_layers(df, freq, unit):
    # Trick: use pandas resample to generate a continuous index
    dfR = df.resample(str(freq) + unit)
    # Map each resampled bucket to a consecutive integer layer id
    layer_map = dict(itertools.izip(dfR.index.astype(np.int64), itertools.count()))
    df['layer'] = np.vectorize(lambda x: layer_map[x])(df.index.astype(np.int64))
    # Time delta between consecutive layers (timedelta)
    delta = dfR.index[1] - dfR.index[0]
    return df, delta
def create_layered_graph(input_df, freq=1, unit='min', action='RT', social_graph=None):
    h = nx.DiGraph()
    h.name = 'Higgs layered ' + action + ' - ' + str(freq) + ' ' + unit
    df = input_df.copy()
    if action != '' and action is not None:
        df = df[df['action'] == action]
    df, unit = round_df(df, freq, unit)
    df, delta_ts = create_layers(df, freq, unit)
    # Maximum node id in the whole dataframe. The minimum id must not be 0,
    # otherwise the per-layer id ranges computed below would overlap.
    max_id = np.max(np.max(df[['src_id', 'tgt_id']]).values)
    for idx, row in df.iterrows():
        base_src = row['src_id']
        base_tgt = row['tgt_id']
        layer = row['layer']
        act = row['action']
        # Encode (base_id, layer) into a unique node id: layer l occupies
        # the id range [l * max_id + 1, (l + 1) * max_id]
        src = base_src + (layer * max_id)
        tgt = base_tgt + ((layer + 1) * max_id)
        # Add nodes
        h.add_node(src, {'base_id': base_src, 'layer': layer, 'timestamp': idx})
        h.add_node(tgt, {'base_id': base_tgt, 'layer': layer + 1, 'timestamp': idx + delta_ts})
        # Add edge, flagging whether it also exists in the social graph
        e_d = {'action': act, 'timestamp': idx}
        if social_graph is not None:
            e_d['is_social'] = social_graph.has_edge(base_src, base_tgt)
        h.add_edge(src, tgt, e_d)
    return h, df
def extract_components(h):
    # Keep only weakly connected components with at least one edge
    components = filter(lambda x: x.number_of_edges() >= 1 and x.number_of_nodes() >= 2,
                        nx.weakly_connected_component_subgraphs(h))
    res = []
    for i, comp in enumerate(components):
        comp.name = i
        res.append({'component': comp})
    df = pd.DataFrame(res)
    df.index.name = 'component_id'
    return df
def enrich_components(df, with_social=False):
    def get_period_span(c):
        ts = sorted(nx.get_node_attributes(c, 'timestamp').values())
        return (ts[0], ts[-1])

    def get_social_edge_ratio(c):
        # True counts as 1, False as 0
        social_edges = sum(nx.get_edge_attributes(c, 'is_social').values())
        return social_edges / float(c.number_of_edges())

    df['node_count'] = df['component'].apply(lambda x: x.number_of_nodes())
    df['edge_count'] = df['component'].apply(lambda x: x.number_of_edges())
    # Height: number of distinct users; width: number of layers spanned
    df['height'] = df['component'].apply(lambda x: len(np.unique(nx.get_node_attributes(x, 'base_id').values())))
    df['width'] = df['component'].apply(lambda x: len(np.unique(nx.get_node_attributes(x, 'layer').values())))
    period_series = df['component'].apply(get_period_span)
    df['start'] = period_series.apply(lambda x: x[0])
    df['end'] = period_series.apply(lambda x: x[1])
    # Only meaningful when the layered graph was built against a social graph
    if with_social:
        df['social_ratio'] = df['component'].apply(get_social_edge_ratio)
    return df.sort('node_count', ascending=False)
def create_activated_components(input_df, freq=1, unit='min', action='RT', social_graph=None):
    h, _ = create_layered_graph(input_df, freq, unit, action, social_graph)
    comp_df = extract_components(h)
    with_social = social_graph is not None
    return enrich_components(comp_df, with_social)
def create_graph_from_activity(activity_df, action='RT'):
    g = nx.DiGraph()
    df = activity_df[activity_df['action'] == action]
    g.name = 'Higgs ' + action
    for idx, d in df.iterrows():
        src = d['src_id']
        tgt = d['tgt_id']
        # Aggregate repeated interactions into an edge weight
        if not g.has_edge(src, tgt):
            g.add_edge(src, tgt, weight=1)
        else:
            g[src][tgt]['weight'] += 1
    return g
def overlap_graph(g1, g2):
    common_edges = 0
    for u, v in g1.edges_iter():
        if g2.has_edge(u, v):
            common_edges += 1
    res = common_edges * 100 / float(nx.number_of_edges(g1))
    print 'Percentage of overlap (', g1.name, ',', g2.name, '):', res
    print 'Number of common edges:', common_edges
    return res, common_edges
def parse_activity(path, reverse=True):
    """If reverse is True, swap source and target so that edges point from
    cause to effect (for a retweet: from the original author to the retweeter)."""
    names = ['src_id', 'tgt_id', 'timestamp', 'action']
    if reverse:
        names = ['tgt_id', 'src_id', 'timestamp', 'action']
    df = pd.read_csv(path, sep=' ', header=None, names=names,
                     dtype={'src_id': np.int64, 'tgt_id': np.int64, 'timestamp': np.int64, 'action': str},
                     index_col=2)
    df['action'] = df['action'].astype(str)
    df.index = df.index.astype('datetime64[s]')
    df.index.name = 'timestamp'
    return df
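As a quick illustration of the two helpers above (a toy example, not part of the original pipeline): round_df floors each timestamp to the start of its bucket, and create_layers maps the buckets to consecutive integer layer ids.
In [ ]:
# Toy example: two events 14'30" apart, bucketed into 10-minute layers
toy = pd.DataFrame({'val': [1, 2]},
                   index=pd.DatetimeIndex(['2012-07-04 00:03:00',
                                           '2012-07-04 00:17:30'], name='timestamp'))
toy, toy_unit = round_df(toy, 10, 'min')
print toy.index            # floored to 00:00:00 and 00:10:00
toy, toy_delta = create_layers(toy, 10, toy_unit)
print toy['layer'].values  # consecutive layer ids: [0 1]
print toy_delta            # 10-minute delta between layers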
In [ ]:
start = time.time()
# path = os.path.join(DATA_DIR, 'HiggsDiscovery_RT.edges.gz')
# RETWEET = nx.read_edgelist(path, create_using=nx.DiGraph(),
# nodetype=int, data=(('weight', int),))
# RETWEET.name = 'Higgs RT'
# nx.write_gpickle(RETWEET, os.path.join(DATA_DIR, 'retweet.gpickle'))
RETWEET = nx.read_gpickle(os.path.join(DATA_DIR, 'retweet.gpickle'))
print 'Retweet graph loaded in:', time.time() - start
print nx.info(RETWEET)
In [ ]:
start = time.time()
# MENTION = nx.read_edgelist(os.path.join(DATA_DIR, 'HiggsDiscovery_MT.edges.gz'),
# create_using=nx.DiGraph(), nodetype=int, data=(('weight', int),))
# MENTION.name = 'Higgs MT'
# nx.write_gpickle(MENTION, os.path.join(DATA_DIR, 'mention.gpickle'))
MENTION = nx.read_gpickle(os.path.join(DATA_DIR, 'mention.gpickle'))
print 'Mention graph loaded in:', time.time() - start
print nx.info(MENTION)
In [ ]:
start = time.time()
# REPLY = nx.read_edgelist(os.path.join(DATA_DIR, 'HiggsDiscovery_RE.edges.gz'),
# create_using=nx.DiGraph(), nodetype=int, data=(('weight', int),))
# REPLY.name = 'Higgs RE'
# nx.write_gpickle(REPLY, os.path.join(DATA_DIR, 'reply.gpickle'))
REPLY = nx.read_gpickle(os.path.join(DATA_DIR, 'reply.gpickle'))
print 'Reply graph loaded in:', time.time() - start
print nx.info(REPLY)
In [ ]:
start = time.time()
# SOCIAL = nx.read_edgelist(os.path.join(DATA_DIR, 'HiggsDiscovery_social.edges.gz'),
# create_using=nx.DiGraph(), nodetype=int, )
# SOCIAL.name = 'Higgs SOCIAL'
# nx.write_gpickle(SOCIAL, os.path.join(DATA_DIR, 'social.gpickle'))
SOCIAL = nx.read_gpickle(os.path.join(DATA_DIR, 'social.gpickle'))
print 'Social graph loaded in:', time.time() - start
print nx.info(SOCIAL)
In [4]:
start = time.time()
# SOCIAL_RE = nx.reverse(SOCIAL)
# SOCIAL_RE.name = 'Higgs SOCIAL reversed'
# nx.write_gpickle(SOCIAL_RE, os.path.join(DATA_DIR, 'social_re.gpickle'))
SOCIAL_RE = nx.read_gpickle(os.path.join(DATA_DIR, 'social_re.gpickle'))
print 'Social reversed graph loaded in:', time.time() - start
print nx.info(SOCIAL_RE)
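The social graph is reversed, presumably so that edges point in the direction information flows (followee to follower), matching the cause-to-effect convention of parse_activity below. A quick illustrative check of the reversal:
In [ ]:
# Every edge (u, v) of SOCIAL should appear as (v, u) in SOCIAL_RE
u, v = next(SOCIAL.edges_iter())
print SOCIAL_RE.has_edge(v, u)  # expect True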
In [5]:
ACTIVITY = parse_activity(os.path.join(DATA_DIR, 'HiggsDiscovery_multiplex_time.txt'), reverse=True)
# Slow: uncomment to build the components (needed by the two cells below)
# COMPS = create_activated_components(ACTIVITY, 10, 'min', 'RT', SOCIAL_RE)
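Each line of the activity file reads `userA userB timestamp action`, with action one of RT, MT or RE (the SNAP Higgs activity-time format); with reverse=True the two id columns are swapped on load. A quick look at the parsed frame:
In [ ]:
print ACTIVITY.head()
print ACTIVITY['action'].value_counts()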
In [ ]:
FILT_COMPS = COMPS[COMPS['node_count'] > 20]  # filter out components that are too small
C = FILT_COMPS.set_index('start', drop=False).sort_index()
# Resample to get statistics (averages the values)
C.resample('12H')
In [ ]:
fig = plot_spatial_spread_graph(FILT_COMPS.iloc[0]['component'])
show(fig)
In [19]:
def create_signal(input_df, freq=1, unit='min', action='RT', subset=None):
    df = input_df.copy()
    if action != '' and action is not None:
        df = df[df['action'] == action]
    df, unit = round_df(df, freq, unit)
    df, delta_ts = create_layers(df, freq, unit)
    df.reset_index(drop=False, inplace=True)
    res = None
    if subset is None:
        # Keep both endpoints: sources stay in their layer, targets are
        # shifted to the next layer (and the next timestamp)
        src_df = df[['src_id', 'layer', 'timestamp']].rename(columns={'src_id': 'base_id'})
        tgt_df = df[['tgt_id', 'layer', 'timestamp']].rename(columns={'tgt_id': 'base_id'})
        tgt_df['layer'] = tgt_df['layer'].apply(lambda x: x + 1)
        tgt_df['timestamp'] = tgt_df['timestamp'].apply(lambda x: x + delta_ts)
        res = pd.concat([src_df, tgt_df], ignore_index=True) \
            .sort('layer').reset_index(drop=True).drop_duplicates()
    else:
        subset_df = df[[subset, 'layer', 'timestamp']].rename(columns={subset: 'base_id'})
        res = subset_df.sort('layer').reset_index(drop=True).drop_duplicates()
    return res
def get_period_span_gl(df, signal):
    def impl(row):
        start_ts = layer_to_ts[np.min(row)]
        end_ts = layer_to_ts[np.max(row)]
        return [start_ts, end_ts]

    layer_to_ts = dict(signal[['layer', 'timestamp']].values)
    period_series = df['layers'].apply(impl)
    df['start'] = period_series.apply(lambda x: x[0])
    df['end'] = period_series.apply(lambda x: x[1])
    return df
def extract_components_gl(signal, graph, create_self_edge=True):
    node_signal = sptgraph.utils.from_pandas(signal)
    h, max_id = sptgraph.create_spatio_temporal_graph(graph, node_signal, create_self_edge, 'base_id', 'layer')
    h = sptgraph.find_connected_components(h)
    comps = sptgraph.get_component_sframe(h, 'base_id', 'layer')
    return get_period_span_gl(comps, signal), h, max_id
In [8]:
SOCIAL_GL_RE = sptgraph.utils.networkx_to_graphlab(SOCIAL_RE)
In [20]:
signal_10min_rt_tgt = create_signal(ACTIVITY, 10, 'min', 'RT', subset='tgt_id')
gl_comps_10min_rt_tgt, H, max_id = extract_components_gl(signal_10min_rt_tgt, SOCIAL_GL_RE)
layer_to_ts = dict(signal_10min_rt_tgt[['layer', 'timestamp']].values)
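The layer_to_ts mapping lets us translate layer ids back to wall-clock time; a short illustrative check:
In [ ]:
# Map the first few layer ids back to wall-clock time
for l in sorted(layer_to_ts)[:3]:
    print l, '->', layer_to_ts[l]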
In [21]:
gl_comps_10min_rt_tgt[['component_id', 'node_count', 'width', 'height', 'start', 'end']]
Out[21]:
In [11]:
big_comp = gl_comps_10min_rt_tgt[0]
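A quick summary of the selected component (illustrative; assumes height counts distinct base ids, as in enrich_components above):
In [ ]:
print 'Component', big_comp['component_id'], ':', big_comp['node_count'], \
    'nodes over', big_comp['width'], 'layers and', big_comp['height'], 'distinct users'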
In [22]:
def plot_component_spatial_spread(comp, layer_to_ts, size=(10, 10), savepath=None):
    hist = Counter(sorted(comp['layers']))
    x, y = zip(*sorted(hist.items(), key=operator.itemgetter(0)))
    comp_time = map(lambda t: (t, layer_to_ts[t]), x)
    # Get xticks by hour
    xticks, labels = zip(*[(k, v.strftime('%H-%M')) for (k, v) in comp_time if v.minute == 0])
    fig = plt.figure(0)
    fig.set_size_inches(size)
    ax = plt.plot(x, y)
    plt.xticks(xticks, labels, rotation=45)
    plt.xlabel('Time')
    plt.ylabel('Number of retweets')
    plt.title('Spatial spread of component {}'.format(comp['component_id']))
    if savepath is not None:
        fig.savefig(savepath, bbox_inches='tight')
    plt.show()
# plot_component_spatial_spread(big_comp, layer_to_ts)
In [92]:
g = sptgraph.component_to_networkx(big_comp, H, layer_to_ts)
print nx.info(g)
In [80]:
g_un = g.to_undirected()
partition = community.best_partition(g_un)
In [81]:
print 'Number of communities:', len(set(partition.values()))
In [82]:
print 'Modularity:', community.modularity(partition, g_un)
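Beyond the global modularity score, it can help to look at how the Louvain communities are sized (an illustrative snippet, not in the original notebook):
In [ ]:
sizes = Counter(partition.values())
print 'Five largest communities (id, size):', sizes.most_common(5)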
In [93]:
path = os.path.join(DATA_DIR, g.name)
# nx.write_edgelist(g, path + '.csv', data=False)
nx.write_graphml(g, path + '.graphml')
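Optionally (not in the original notebook), the export can be sanity-checked by reading the GraphML file back and comparing node and edge counts:
In [ ]:
g_check = nx.read_graphml(path + '.graphml')
print 'Nodes match:', nx.number_of_nodes(g_check) == nx.number_of_nodes(g)
print 'Edges match:', nx.number_of_edges(g_check) == nx.number_of_edges(g)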
In [ ]:
signal_1min_rt_tgt = create_signal(ACTIVITY, 1, 'min', 'RT', subset='tgt_id')
gl_comps_1min_rt_tgt, H_1min, max_id_1min = extract_components_gl(signal_1min_rt_tgt, SOCIAL_GL_RE)
In [ ]:
gl_comps_1min_rt_tgt[['component_id', 'node_count', 'width', 'height', 'start', 'end']]
In [ ]:
# overlap_graph(REPLY, SOCIAL)
overlap_graph(RETWEET, SOCIAL)
# overlap_graph(MENTION, SOCIAL)